# Load (installing if needed) the packages used throughout this analysis.
librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
# Use TRUE/FALSE (never the reassignable shorthand T/F).
options(readr.show_col_types = FALSE)

# Paths to the species-distribution-model data prepared in the previous lab.
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
# Interactive table of the presence/absence points with environmental values.
datatable(pts_env, rownames = FALSE)

Learning Objectives

Explore (cont’d)

Pairs Plot

Show correlations between variables.

# Pairwise scatterplots and correlations among predictors, color-coded by
# presence (1) vs. absence (0); the ID column carries no information so it
# is dropped first.
pts_env %>% 
  select(-ID) %>% 
  GGally::ggpairs(aes(color = factor(present)))
Pairs plot with `present` color coded.

Pairs plot with present color coded.

Logistic Regression

Full model workflow with split, fit, predict and evaluate process steps.

Setup Data

Let’s setup a data frame with only the data we want to model by:

  • Dropping rows with any NAs. Later we’ll learn how to “impute” values with guesses so as to not throw away data.
  • Removing terms we don’t want to model. We can then use a simplified formula \(present \sim .\) to predict \(present\) based on all other fields in the data frame (i.e. the \(x\)’s in \(y \sim x_1 + x_2 + \cdots + x_n\)).
# Build the modeling data frame: drop the ID field (not a predictor), then
# remove any rows containing NA values (imputation comes in a later lesson).
d <- tidyr::drop_na(select(pts_env, -ID))
nrow(d)
## [1] 19912

Linear Model

Let’s start as simply as possible with a linear model lm() on multiple predictors X to predict presence y using a simpler workflow.

Simpler workflow with only fit and predict of all data, i.e. no splitting.

# fit a linear model
# Ordinary least squares on the 0/1 response `present` using all other
# columns of d as predictors. This is a deliberately naive first pass:
# a Gaussian linear model does not constrain predictions to [0, 1],
# which the range() check below demonstrates.
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28329 -0.09504  0.01992  0.13295  0.86890 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)  1.541635705  0.148412953   10.39 <0.0000000000000002 ***
## WC_alt      -0.000507343  0.000009035  -56.16 <0.0000000000000002 ***
## WC_bio1     -0.081418278  0.001422555  -57.23 <0.0000000000000002 ***
## WC_bio2     -0.029554373  0.000979939  -30.16 <0.0000000000000002 ***
## ER_tri      -0.002474917  0.000123228  -20.08 <0.0000000000000002 ***
## ER_topoWet  -0.034489623  0.002817935  -12.24 <0.0000000000000002 ***
## lon         -0.043608547  0.001170300  -37.26 <0.0000000000000002 ***
## lat         -0.105980684  0.001060644  -99.92 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2465 on 19904 degrees of freedom
## Multiple R-squared:  0.7571, Adjusted R-squared:  0.757 
## F-statistic:  8864 on 7 and 19904 DF,  p-value: < 0.00000000000000022
# Predict on the same data used to fit (no train/test split in this
# simpler workflow).
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

# Predicted values spill outside the valid [0, 1] interval...
range(y_predict)
## [1] -0.5193748  1.3127056
# ...while the observed response is strictly 0 or 1.
range(y_true)
## [1] 0 1

The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)

Generalized Linear Model

To solve this problem of constraining the response term to being between the two possible values, i.e. the probability \(p\) of being one or the other possible \(y\) values, we’ll apply the logistic transformation on the response term.

\[ logit(p_i) = \log_{e}\left( \frac{p_i}{1-p_i} \right) \]

We can then expand the predicted term, i.e. the probability \(p\) of being either \(y\) value, across all possible predictors \(X\), whereby each coefficient \(b\) gets multiplied by the value of its \(x\):

\[ \log_{e}\left( \frac{p_i}{1-p_i} \right) = b_0 + b_1 x_{1,i} + b_2 x_{2,i} + \cdots + b_k x_{k,i} \]

# fit a generalized linear model with a binomial logit link function
# The logit link maps the linear predictor onto (0, 1), so fitted values
# are valid probabilities of presence.
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
# The warning below indicates some observations are predicted with
# probability effectively 0 or 1 (near-perfect separation on some terms);
# it is reported output from the original run, not an error.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.4334  -0.0049   0.0000   0.1261   3.2228  
## 
## Coefficients:
##                 Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept) -169.1651358    6.1541922 -27.488 < 0.0000000000000002 ***
## WC_alt        -0.0016905    0.0002623  -6.446       0.000000000115 ***
## WC_bio1        0.5523077    0.0428240  12.897 < 0.0000000000000002 ***
## WC_bio2       -0.8249942    0.0280555 -29.406 < 0.0000000000000002 ***
## ER_tri        -0.0277293    0.0032753  -8.466 < 0.0000000000000002 ***
## ER_topoWet    -0.7398075    0.0554387 -13.345 < 0.0000000000000002 ***
## lon           -1.8891459    0.0557322 -33.897 < 0.0000000000000002 ***
## lat           -1.2155325    0.0311678 -39.000 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27603.6  on 19911  degrees of freedom
## Residual deviance:  4549.7  on 19904  degrees of freedom
## AIC: 4565.7
## 
## Number of Fisher Scoring iterations: 9
# type = "response" returns probabilities (inverse-logit of the linear
# predictor) rather than values on the link scale.
y_predict <- predict(mdl, d, type="response")

# Predictions are now properly bounded within [0, 1].
range(y_predict)
## [1] 0.0000000000000002220446 0.9999712687694529700266

Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.

# show term plots: partial-residual plots with standard-error bands, one
# panel per predictor, each with its own ("free") y-axis limits.
# Fix: use FALSE rather than the reassignable shorthand F.
termplot(mdl, partial.resid = TRUE, se = TRUE, main = FALSE, ylim = "free")

Generalized Additive Model

With a generalized additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms.

# Load mgcv for generalized additive models.
librarian::shelf(mgcv)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
# Each predictor is wrapped in s() so mgcv estimates a smooth (nonlinear)
# effect per term; family = binomial keeps the logit link as in the GLM.
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -33.22      29.62  -1.122    0.262
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq              p-value    
## s(WC_alt)     3.135  3.643  68.83 < 0.0000000000000002 ***
## s(WC_bio1)    8.052  8.137  45.45 < 0.0000000000000002 ***
## s(WC_bio2)    7.915  8.314 196.31 < 0.0000000000000002 ***
## s(ER_tri)     6.216  7.199  24.10              0.00123 ** 
## s(ER_topoWet) 5.481  6.669  30.49            0.0000399 ***
## s(lon)        7.751  7.989 179.68 < 0.0000000000000002 ***
## s(lat)        8.907  8.995 354.86 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.917   Deviance explained = 89.1%
## UBRE = -0.84373  Scale est. = 1         n = 19912
# show term plots
# plot.gam draws one panel per smooth term; scale=0 gives each panel its
# own y-axis range.
plot(mdl, scale=0)

HELP

Maxent (Maximum Entropy)

Maxent is probably the most commonly used species distribution model (Elith 2011) since it performs well with few input data points, only requires presence points (and samples background for comparison) and is easy to use with a Java graphical user interface (GUI).

# load extra packages
# NOTE(review): maptools is used only for the wrld_simpl country outlines
# below; the package has since been retired from CRAN -- confirm an
# archived version is available, or source the outlines elsewhere.
librarian::shelf(
  maptools, sf)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# Path where the fitted Maxent model is cached between renders.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
# Calling maxent() with no arguments prints the Java MaxEnt version; only
# done when knitting (non-interactive) so interactive sessions skip it.
if (!interactive())
  maxent()
## Loading required namespace: rJava
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
# stack() comes from the raster package (loaded via dismo; see the
# raster::select masking note at the top of the file).
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
# Fitting is slow, so the model is written to disk once and re-read on
# subsequent renders; delete mdl_maxent_rds to force a refit.
if (!file.exists(mdl_maxent_rds)){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)

# plot variable contributions per predictor
plot(mdl)

# plot term plots
# dismo::response() draws one response curve per environmental predictor.
response(mdl)

# predict
# Predicting over the raster stack yields a prediction surface (a raster),
# not a vector of values.
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
# Overlay simplified world country borders for geographic context.
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Notice how the plot() function produces different outputs depending on the class of the input object. You can view help for each of these with R Console commands: ?plot.lm, ?plot.gam and plot,DistModel,numeric-method.